source('notebooks/pbmcs/functions.R')
reticulate::use_virtualenv('gibss')
data <- prep_pbmc_data()3 PBMC GSEA Example
3.1 Running logistic SuSiE on the PBMC data
#' function for getting the gene set data
load_regression_data <- function(data, celltype, quantile, db_name){
db <- WebGestaltR::loadGeneSet(enrichDatabase = db_name)
data %>%
make_gene_list(celltype, q) %>%
gseasusie:::prepare_data()
}
#' driver function for running logistic SuSiE on pbmc data + webgestalt gene sets
driver <- function(celltype, q, db, path){
print("I'm driving here!")
data %>%
make_gene_list(celltype, q) %>%
{gseasusie::fit_gsea_susie_webgestalt(
.$gene_list,
.$gene_background,
db,
tol=1e-5,
maxiter=100,
alpha=0.5,
gamma=0.0
)} %>%
save_model(path=path)
return(T)
}Here is an example of how to use the driver function.
# the driver function fits the model and saves it, just returns true
driver(
'CD14+ Monocyte',
0.1,
'geneontology_Biological_Process',
'notebooks/pbmcs/results/cd14plusmonocyte/geneontology_Biological_Process/q=0.1.rds'
)
# load the results. Contains $fit, $data, $time
res <- readRDS('notebooks/pbmcs/results/cd14plusmonocyte/geneontology_Biological_Process/q=0.1.rds')Now we run logistic SuSiE using the top 10% of genes in each celltype. We rerun the analysis for the gene ontology subsets available through the WebGestaltR package. WebGestaltR offers various subsets of the GO, e.g. geneontology_Biological_Process, and an abridged version geneontology_Biological_Process_noRedundant.
celltypes <- names(data)
celltypes_sanitized <- c('cd14plusmonocyte', 'cd19plusb', 'cd34plus', 'cd56plusnk', 'tcell')
quantiles <- c(0.1)
db <- c(
'geneontology_Biological_Process',
'geneontology_Biological_Process_noRedundant',
'geneontology_Cellular_Component',
'geneontology_Cellular_Component_noRedundant',
'geneontology_Molecular_Function',
'geneontology_Molecular_Function_noRedundant'
)
config <- tidyr::crossing(
tibble(celltype = celltypes, celltype_sanitized=celltypes_sanitized),
q=quantiles,
db = db
)make_path <- function(celltype_sanitized, db, q){
glue::glue('notebooks/pbmcs/results/{celltype_sanitized}/{db}/q={q}.rds')
}
config %>%
dplyr::rowwise() %>%
dplyr::mutate(path = make_path(celltype_sanitized, db, q)) %>%
dplyr::mutate(run=driver(celltype, q, db, path))make_cs_tbl_single <- function(fit, l){
cs <- fit$fit$credible_sets[[l]]
tidyr::as_tibble(cs) %>%
dplyr::mutate(
geneSetIdx = cs + 1,
component = paste0('L', l),
lbf_ser = fit$fit$lbf_ser[[l]]
) %>%
dplyr::select(-cs) %>%
dplyr::left_join(fit$data$geneSets) %>%
dplyr::group_by(component, geneSet) %>%
dplyr::mutate(
geneSetSize = n(),
propInList = mean(geneInList)
) %>%
dplyr::ungroup()
}
make_cs_tbl <- function(fit, min_lbf_ser=log(10.)){
# include components with large enough lbf_ser
include_components <- which(fit$fit$lbf_ser > min_lbf_ser)
purrr::map_dfr(include_components, ~make_cs_tbl_single(fit, .x))
}
make_component_tbl_single <- function(fit, l){
# component, gene set, alpha, beta, lbf, prior_variance
with(fit$fit, tibble(
component=glue::glue('L{l}'),
geneSet = fit$data$geneSetMapping$geneSet,
alpha=alpha[l,],
beta=beta[l,],
lbf=lbf[l,],
prior_variance=prior_variance[l]))
}
make_component_tbl <- function(fit, min_lbf_ser = log(10)){
include_components <- which(fit$fit$lbf_ser > min_lbf_ser)
purrr::map_dfr(include_components, ~make_component_tbl_single(fit, .x))
}db = 'geneontology_Biological_Process'
q = 0.1
celltype = 'CD19+ B'
celltypes_sanitized_list <- as.list(celltypes_sanitized)
names(celltypes_sanitized_list) <- celltype
load_model <- function(celltype, q, db){
# load fit model
path <- glue::glue('notebooks/pbmcs/results/{celltypes_sanitized_list[[celltype]]}/{db}/q={q}.rds')
message(glue::glue('loading fit from: {path}'))
fit <- readRDS(path)
return(fit)
}
load_model2 <- function(path){
# load fit model
message(glue::glue('loading fit from: {path}'))
fit <- readRDS(path)
return(fit)
}
make_cs_tbl_nested <- function(fit){
more_go_info <- AnnotationDbi::select(GO.db::GO.db,
keys = unique(fit$data$geneSets$geneSet),
columns = c('TERM', 'DEFINITION'),
keytype = 'GOID') %>%
as_tibble() %>%
dplyr::mutate(geneSet=GOID)
more_gene_info <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
keys = unique(fit$data$geneMapping$gene),
columns = c('SYMBOL', 'GENETYPE', 'GENENAME'),
keytype = 'ENTREZID') %>%
as_tibble() %>%
dplyr::mutate(gene = ENTREZID)
gene_columns <- c('ENTREZID', 'SYMBOL', 'GENENAME', 'geneInList')
gene_set_columns <- c('geneSet', 'TERM', 'DEFINITION', 'alpha', 'beta', 'lbf', 'geneSetSize', 'propInList')
component_columns <- c('component', 'size', 'coverage', 'target_coverage', 'lbf_ser', 'prior_variance')
all_columns <- c(component_columns, gene_set_columns, gene_columns)
message('building nested credible set table')
fit %>%
make_cs_tbl() %>%
left_join(make_component_tbl(fit)) %>% # add effect estimates, etc.
left_join(more_go_info) %>%
left_join(more_gene_info) %>%
dplyr::select(all_columns) %>%
# nest gene level data
nest(.by= c(component_columns, gene_set_columns), .key='details') %>%
# nest gene set level data
nest(.by= component_columns, .key='details')
}
cs_tbl_nested <- load_model(celltype, q, db) %>%
make_cs_tbl_nested()nested_summary <- config %>%
dplyr::rowwise() %>%
dplyr::mutate(path = make_path(celltype_sanitized, db, q)) %>%
ungroup() %>%
dplyr::mutate(nested_table = purrr::map(path, ~make_cs_tbl_nested(readRDS(.x))))nested_summary %>%
saveRDS('notebooks/pbmcs/results/nested_table.rds')3.2 A table of PBMC results on GO
source('notebooks/pbmcs/functions.R')── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- readRDS('notebooks/pbmcs/results/nested_table.rds') %>%
dplyr::select(-path) %>%
dplyr::rename(details = nested_table)
reactable(
dplyr::select(data, -c(celltype_sanitized, details)),
details = make_get_details(data))